{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "32e57a15",
   "metadata": {},
   "outputs": [],
   "source": [
    "# only run this if your have an editable install\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e510ea2a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset wiki_qa (/home/jjmachan/.cache/huggingface/datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['question_id', 'question', 'document_title', 'answer', 'label'],\n",
       "    num_rows: 6165\n",
       "})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset, concatenate_datasets\n",
    "\n",
    "DATASET_URL = \"wiki_qa\"\n",
    "SPLIT = \"test\"\n",
    "ds = load_dataset(DATASET_URL, split=SPLIT)\n",
    "ds"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "131cb2c0",
   "metadata": {},
   "source": [
    "### Download the wikipedia pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c9842ef1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import wikipedia as wiki\n",
    "\n",
    "wiki_doc_titles = sorted(set(ds[\"document_title\"]))\n",
    "\n",
    "\n",
    "def get_wiki_doc(title):\n",
    "    assert isinstance(title, str)\n",
    "    try:\n",
    "        doc = wiki.WikipediaPage(title=title)\n",
    "    except (wiki.PageError, wiki.DisambiguationError) as e:\n",
    "        return False\n",
    "    with open(f\"./data/{title}.txt\", \"w\") as f:\n",
    "        f.write(doc.content)\n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "3d64abcc",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 20%|███████████▏                                            | 124/619 [04:55<11:16,  1.37s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Crater lake] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 23%|████████████▎                                         | 141/619 [07:28<3:47:03, 28.50s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Deep Blue Sea] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 37%|████████████████████▉                                   | 232/619 [11:44<09:56,  1.54s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Hannibal (film)] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 43%|████████████████████████                                | 266/619 [14:43<08:17,  1.41s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[IÂ²C] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 46%|█████████████████████████▌                              | 283/619 [15:06<07:38,  1.36s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[June bug] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 48%|██████████████████████████▋                             | 295/619 [15:22<06:13,  1.15s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[La NiÃ±a] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 53%|█████████████████████████████▌                          | 327/619 [16:01<05:20,  1.10s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[List of youngest birth mothers] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 66%|████████████████████████████████████▉                   | 408/619 [17:48<04:27,  1.27s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Our Song] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 66%|█████████████████████████████████████▏                  | 411/619 [17:52<04:28,  1.29s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Parcel] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 69%|██████████████████████████████████████▌                 | 426/619 [18:41<33:02, 10.27s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[PokÃ©mon] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 73%|████████████████████████████████████████▋               | 450/619 [19:14<04:22,  1.55s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Range (mathematics)] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 80%|████████████████████████████████████████████▋           | 494/619 [20:17<02:59,  1.44s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Sixth Army] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 82%|█████████████████████████████████████████████▊          | 506/619 [21:37<08:56,  4.75s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Spades] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 86%|████████████████████████████████████████████████▎       | 534/619 [22:16<01:50,  1.30s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Systemic] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 87%|████████████████████████████████████████████████▌       | 537/619 [22:20<01:36,  1.18s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Tamari] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 89%|██████████████████████████████████████████████████      | 553/619 [22:42<01:26,  1.32s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[The Bells] failed!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████| 619/619 [24:20<00:00,  2.36s/it]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "failed = []\n",
    "for t in tqdm(wiki_doc_titles):\n",
    "    r = get_wiki_doc(t)\n",
    "    if r is False:\n",
    "        failed.append(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "bbea4697",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Crater lake',\n",
       " 'Deep Blue Sea',\n",
       " 'Hannibal (film)',\n",
       " 'IÂ²C',\n",
       " 'June bug',\n",
       " 'La NiÃ±a',\n",
       " 'List of youngest birth mothers',\n",
       " 'Our Song',\n",
       " 'Parcel',\n",
       " 'PokÃ©mon',\n",
       " 'Range (mathematics)',\n",
       " 'Sixth Army',\n",
       " 'Spades',\n",
       " 'Systemic',\n",
       " 'Tamari',\n",
       " 'The Bells']"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "failed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "5c6dd6b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open(\"failed_wikis\", \"w\") as f:\n",
    "    json.dump(failed, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3004862c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "16"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "with open(\"failed_wikis\") as f:\n",
    "    failed = json.load(f)\n",
    "\n",
    "len(failed)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ed636b5f",
   "metadata": {},
   "source": [
    "## Clean Dataset\n",
    "\n",
    "- remove the failed wiki's and questions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f01c62c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_failed_and_incorrect(row):\n",
    "    if row[\"document_title\"] in failed:\n",
    "        return False\n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "8d4ec036",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at /home/jjmachan/.cache/huggingface/datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c/cache-61607228d22e9a55.arrow\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(6049, 5)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cleaned_ds1 = ds.filter(clean_failed_and_incorrect, batched=False)\n",
    "cleaned_ds1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b47bb412",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "603"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "count_wiki_questions = {\n",
    "    item: cleaned_ds1[\"document_title\"].count(item)\n",
    "    for item in set(cleaned_ds1[\"document_title\"])\n",
    "}\n",
    "len(count_wiki_questions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "657e4c4d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "603"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(cleaned_ds1[\"document_title\"]))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20055528",
   "metadata": {},
   "source": [
    "## baseline\n",
    "\n",
    "first load the selected documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "4408c710",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "603"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from llama_index import Document\n",
    "\n",
    "docs = []\n",
    "for d in sorted(set(cleaned_ds1[\"document_title\"])):\n",
    "    with open(f\"./data/{d}.txt\") as f:\n",
    "        docs.append(Document(text=f.read()))\n",
    "\n",
    "len(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "dc66ca8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index import GPTVectorStoreIndex\n",
    "from langchain.embeddings.huggingface import HuggingFaceEmbeddings\n",
    "from llama_index import LangchainEmbedding, ServiceContext, StorageContext\n",
    "\n",
    "# load in HF embedding model from langchain\n",
    "embed_model = LangchainEmbedding(HuggingFaceEmbeddings())\n",
    "hf_sc = ServiceContext.from_defaults(embed_model=embed_model)\n",
    "\n",
    "# openai embeddings\n",
    "openai_sc = ServiceContext.from_defaults()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "d8620a8e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "16966"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make nodes\n",
    "from llama_index.node_parser import SimpleNodeParser\n",
    "from langchain.text_splitter import TokenTextSplitter\n",
    "\n",
    "spliter = TokenTextSplitter(chunk_size=300, chunk_overlap=50)\n",
    "\n",
    "parser = SimpleNodeParser(text_splitter=spliter)\n",
    "\n",
    "nodes = parser.get_nodes_from_documents(documents=docs)\n",
    "\n",
    "len(nodes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "8631f4ac",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Node(text='The 18th century lasted from January 1, 1701 (MDCCI) to December 31, 1800 (MDCCC). During the 18th century, elements of Enlightenment thinking culminated in the American, French, and Haitian Revolutions. During the century, slave trading and human trafficking expanded across the shores of the Atlantic, while declining in Russia, China, and Korea. Revolutions began to challenge the legitimacy of monarchical and aristocratic power structures, including the structures and beliefs that supported slavery. The Industrial Revolution began during mid-century, leading to radical changes in human society and the environment. \\nWestern historians have occasionally defined the 18th century otherwise for the purposes of their work. For example, the \"short\" 18th century may be defined as 1715–1789, denoting the period of time between the death of Louis XIV of France and the start of the French Revolution, with an emphasis on directly interconnected events. To historians who expand the century to include larger historical movements, the \"long\" 18th century may run from the Glorious Revolution of 1688 to the Battle of Waterloo in 1815 or even later.The period is also known as the \"century of lights\" or the \"century of reason\". In continental Europe, philosophers dreamed of a brighter age. For some, this dream turned into a reality with the French Revolution of 1789, though this was later compromised by the excesses of the Reign of Terror. At first, many monarchies of Europe embraced', doc_id='aa92390b-4adf-43b9-a4c6-d7decf31e7e7', embedding=None, doc_hash='a443487df4275d0b829af8621aabe669945cf6a1454c0c52469b02583fbb4461', extra_info=None, node_info=None, relationships={<DocumentRelationship.SOURCE: '1'>: '782f1ba7-396e-45e1-8753-5ca7087af322', <DocumentRelationship.NEXT: '3'>: '2ebde8b8-aba6-4f76-bf33-0765da173ffa'})"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nodes[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "91980b5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CHANGE SERVICE_CONTEXT HERE!!!\n",
    "service_context = openai_sc\n",
    "\n",
    "# create index\n",
    "index = GPTVectorStoreIndex.from_documents(\n",
    "    documents=docs,\n",
    "    service_context=service_context,\n",
    ")\n",
    "\n",
    "# query with embed_model specified\n",
    "qe = index.as_query_engine(\n",
    "    mode=\"embedding\", verbose=True, service_context=service_context\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "27af8b67",
   "metadata": {},
   "outputs": [],
   "source": [
    "# save the index\n",
    "index.storage_context.persist(persist_dir=\"./storage\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a340bfbf",
   "metadata": {},
   "source": [
    "when loading make sure `service_context` is initialized and configured the same"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "789167b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the index\n",
    "from llama_index import StorageContext, load_index_from_storage, ServiceContext\n",
    "\n",
    "# CHANGE SERVICE_CONTEXT HERE!!!\n",
    "openai_sc = ServiceContext.from_defaults()\n",
    "service_context = openai_sc\n",
    "\n",
    "# rebuild storage context\n",
    "storage_context = StorageContext.from_defaults(persist_dir=\"./storage\")\n",
    "\n",
    "# load index\n",
    "index = load_index_from_storage(storage_context)\n",
    "\n",
    "# query with embed_model specified\n",
    "qe = index.as_query_engine(\n",
    "    mode=\"embedding\", verbose=True, service_context=service_context, use_async=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "0622653f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "The Mohawks live in settlements in northern New York State and southeastern Canada, including the reserves of Kanièn:ke, Kanaʼtsioharè:ke, Ahkwesáhsne, Kahnawà:ke, Kanehsatà:ke, Tioweró:ton, Kenhtè꞉ke, Wáhta, and Ohswé:ken.\n"
     ]
    }
   ],
   "source": [
    "from llama_index import (\n",
    "    GPTVectorStoreIndex,\n",
    "    ResponseSynthesizer,\n",
    ")\n",
    "from llama_index.retrievers import VectorIndexRetriever\n",
    "from llama_index.query_engine import RetrieverQueryEngine\n",
    "from llama_index.indices.postprocessor import SimilarityPostprocessor\n",
    "\n",
    "# configure retriever\n",
    "retriever = VectorIndexRetriever(\n",
    "    index=index,\n",
    "    similarity_top_k=3,\n",
    ")\n",
    "\n",
    "# configure response synthesizer\n",
    "response_synthesizer = ResponseSynthesizer.from_args(\n",
    "    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)]\n",
    ")\n",
    "\n",
    "# assemble query engine\n",
    "query_engine = RetrieverQueryEngine(\n",
    "    retriever=retriever,\n",
    "    response_synthesizer=response_synthesizer,\n",
    ")\n",
    "\n",
    "# query\n",
    "response = query_engine.query(\"where do the mohawks live?\")\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "f83553c4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\nThe Mohawks live in settlements in northern New York State and southeastern Canada, including the reserves of Kanièn:ke, Kanaʼtsioharè:ke, Ahkwesáhsne, Kahnawà:ke, Kanehsatà:ke, Tioweró:ton, Kenhtè꞉ke, Wáhta, and Ohswé:ken.'"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "response.response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "3b20a21f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('Q2004',\n",
       " 'who is flo from progressive',\n",
       " 'Flo debuted in 2008 through television commercials and has since appeared in radio advertisements and web banners .')"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "i = 45\n",
    "q_id, q, a = final_ds[i][\"question_id\"], final_ds[i][\"question\"], final_ds[i][\"answer\"]\n",
    "q_id, q, a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "68e5b00f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import Markdown, display\n",
    "\n",
    "r = qe.query(q)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "372b2763",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "\n",
       "<b>Q1506: where do the mohawks live</b>\n",
       "\n",
       "<b>Generated Answer: </b>\n",
       "<i>\n",
       "The Mohawks live in settlements in northern New York State and southeastern Canada, including Kanièn:ke, Kanaʼtsioharè:ke, Ahkwesáhsne, Kahnawà:ke, Kanehsatà:ke, Tioweró:ton, Kenhtè꞉ke, Wáhta, and Ohswé:ken.</i>\n",
       "\n",
       "<b>Original Answer: </b>\n",
       "<i>Their traditional homeland stretched southward of the Mohawk River , eastward to the Green Mountains of Vermont , westward to the border with the Oneida Nation 's traditional homeland territory, and northward to the St Lawrence River.</i>\n"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(\n",
    "    Markdown(\n",
    "        f\"\"\"\n",
    "<b>{q_id}: {q}</b>\n",
    "\n",
    "<b>Generated Answer: </b>\n",
    "<i>{r}</i>\n",
    "\n",
    "<b>Original Answer: </b>\n",
    "<i>{a}</i>\n",
    "\"\"\"\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "8e6cd3eb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question_id': 'Q33',\n",
       " 'question': 'how are antibodies used in',\n",
       " 'document_title': 'antibody',\n",
       " 'answer': 'An antibody (Ab), also known as an immunoglobulin (Ig), is a large Y-shaped protein produced by B-cells that is used by the immune system to identify and neutralize foreign objects such as bacteria and viruses .',\n",
       " 'label': 1,\n",
       " 'generated_text': 'research?\\n\\nAntibodies are used in research to identify and locate intracellular and extracellular proteins, differentiate cell types by the proteins they express, separate proteins and anything bound to them (co-immunoprecipitation) from other molecules in a cell lysate, identify proteins separated by electrophoresis, examine protein expression in tissue sections or to locate proteins within cells with the assistance of a microscope, detect and quantify proteins with ELISA and ELISpot techniques, and act as a guide for drugs to reach their target.',\n",
       " 'retrieved_context': ['by dilution cloning to generate cell clones that all produce the same antibody; these antibodies are called monoclonal antibodies. Polyclonal and monoclonal antibodies are often purified using Protein A/G or antigen-affinity chromatography.In research, purified antibodies are used in many applications. Antibodies for research applications can be found directly from antibody suppliers, or through use of a specialist search engine. Research antibodies are most commonly used to identify and locate intracellular and extracellular proteins. Antibodies are used in flow cytometry to differentiate cell types by the proteins they express; different types of cells express different combinations of cluster of differentiation molecules on their surface, and produce different intracellular and secretable proteins. They are also used in immunoprecipitation to separate proteins and anything bound to them (co-immunoprecipitation) from other molecules in a cell lysate, in Western blot analyses to identify proteins separated by electrophoresis, and in immunohistochemistry or immunofluorescence to examine protein expression in tissue sections or to locate proteins within cells with the assistance of a microscope. Proteins can also be detected and quantified with antibodies, using ELISA and ELISpot techniques.Antibodies used in research are some of the most powerful, yet most problematic reagents with a tremendous number of factors that must be controlled in any experiment including cross reactivity, or the antibody recognizing multiple epitopes and affinity, which can vary widely depending on experimental conditions such as pH, solvent, state of tissue etc. Multiple attempts have been made to improve both the way that researchers validate antibodies and ways in which they report on antibodies. Researchers using antibodies in their work need to record them correctly in order to allow their research to be reproducible (and therefore tested, and qualified by other researchers). Less than half of research antibodies referenced in academic papers can be easily identified. Papers published in F1000 in 2014 and 2015 provide researchers with a guide for reporting research antibody use. The RRID paper, is co-published in 4 journals that implemented the RRIDs Standard for research resource citation, which draws data from the antibodyregistry.org as the source of antibody identifiers (see also group at Force11).\\nAntibody regions can be used to further biomedical research by acting as a guide for drugs to reach their target. Several application involve using bacterial plasmids to tag plasmids with the Fc region of the antibody such as pFUSE-Fc plasmid.\\n\\n\\n== Regulations ==\\n\\n\\n=== Production and testing ===\\nTraditionally, most antibodies are produced by hybridoma cell lines through immortalization of antibody-producing cells by chemically-induced fusion with myeloma cells. In some cases, additional fusions with other lines have created \"triomas\" and \"quadromas\". The manufacturing process should be appropriately described and validated. Validation studies should at least include:\\n\\nThe demonstration that the process is able to produce in good quality (the process should be validated)\\nThe efficiency of the antibody purification (all impurities and virus must be eliminated)\\nThe characterization of purified antibody (physicochemical characterization, immunological properties, biological activities, contaminants, ...)\\nDetermination of the virus clearance studies\\n\\n\\n=== Before clinical trials ===\\nProduct safety testing: Sterility (bacteria and fungi), in vitro and in vivo testing for adventitious viruses, murine retrovirus testing..., product safety data needed before the initiation of feasibility trials in serious or immediately life-threatening conditions, it serves to evaluate dangerous potential of the product.\\nFeasibility testing: These are pilot studies whose objectives include, among others, early characterization of safety and initial proof of concept in a small specific patient population (in vitro or in vivo testing).\\n\\n\\n=== Preclinical studies ===\\nTesting cross-reactivity of antibody: to highlight unwanted interactions (toxicity) of antibodies with previously characterized tissues. This study can be',\n",
       "  'chemistry enables radioactive fluoride (18F) labeling of antibodies, which allows for positron emission tomography (PET) imaging of cancer.\\n\\n\\n=== Disease therapy ===\\nTargeted monoclonal antibody therapy is employed to treat diseases such as rheumatoid arthritis, multiple sclerosis, psoriasis, and many forms of cancer including non-Hodgkin\\'s lymphoma, colorectal cancer, head and neck cancer and breast cancer.Some immune deficiencies, such as X-linked agammaglobulinemia and hypogammaglobulinemia, result in partial or complete lack of antibodies. These diseases are often treated by inducing a short-term form of immunity called passive immunity. Passive immunity is achieved through the transfer of ready-made antibodies in the form of human or animal serum, pooled immunoglobulin or monoclonal antibodies, into the affected individual.\\n\\n\\n=== Prenatal therapy ===\\nRh factor, also known as Rh D antigen, is an antigen found on red blood cells; individuals that are Rh-positive (Rh+) have this antigen on their red blood cells and individuals that are Rh-negative (Rh–) do not. During normal childbirth, delivery trauma or complications during pregnancy, blood from a fetus can enter the mother\\'s system. In the case of an Rh-incompatible mother and child, consequential blood mixing may sensitize an Rh- mother to the Rh antigen on the blood cells of the Rh+ child, putting the remainder of the pregnancy, and any subsequent pregnancies, at risk for hemolytic disease of the newborn.Rho(D) immune globulin antibodies are specific for human RhD antigen. Anti-RhD antibodies are administered as part of a prenatal treatment regimen to prevent sensitization that may occur when a Rh-negative mother has a Rh-positive fetus. Treatment of a mother with Anti-RhD antibodies prior to and immediately after trauma and delivery destroys Rh antigen in the mother\\'s system from the fetus. It is important to note that this occurs before the antigen can stimulate maternal B cells to \"remember\" Rh antigen by generating memory B cells. Therefore, her humoral immune system will not make anti-Rh antibodies, and will not attack the Rh antigens of the current or subsequent babies. Rho(D) Immune Globulin treatment prevents sensitization that can lead to Rh disease, but does not prevent or treat the underlying disease itself.\\n\\n\\n== Research applications ==\\n\\nSpecific antibodies are produced by injecting an antigen into a mammal, such as a mouse, rat, rabbit, goat, sheep, or horse for large quantities of antibody. Blood isolated from these animals contains polyclonal antibodies—multiple antibodies that bind to the same antigen—in the serum, which can now be called antiserum. Antigens are also injected into chickens for generation of polyclonal antibodies in egg yolk. To obtain antibody that is specific for a single epitope of an antigen, antibody-secreting lymphocytes are isolated from the animal and immortalized by fusing them with a cancer cell line. The fused cells are called hybridomas, and will continually grow and secrete antibody in culture. Single hybridoma cells are isolated by dilution cloning to generate cell clones that all produce the same antibody; these antibodies are called monoclonal antibodies. Polyclonal and monoclonal antibodies are often purified using Protein A/G or antigen-affinity chromatography.In research, purified antibodies are used in many applications. Antibodies for research applications can be found directly from antibody suppliers, or through use of a specialist search engine. Research antibodies are most commonly used to identify and locate intracellular and extracellular proteins. Antibodies are used in flow cytometry to differentiate cell types by the proteins they express; different types of cells express different combinations of cluster of differentiation molecules on their surface, and produce different intracellular and secretable proteins. They are'],\n",
       " 'prompt': 'how are antibodies used in',\n",
       " 'ground_truth': 'An antibody (Ab), also known as an immunoglobulin (Ig), is a large Y-shaped protein produced by B-cells that is used by the immune system to identify and neutralize foreign objects such as bacteria and viruses .'}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def generate_response(row):\n",
    "    r = qe.query(row[\"question\"])\n",
    "    row[\"generated_text\"] = r.response\n",
    "    row[\"retrieved_context\"] = [sn.node.text for sn in r.source_nodes]\n",
    "\n",
    "    # some renamings for ragas\n",
    "    row[\"prompt\"] = row[\"question\"]\n",
    "    row[\"ground_truth\"] = row[\"answer\"]\n",
    "    return row\n",
    "\n",
    "\n",
    "generate_response(final_ds[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "d125c2e0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/56 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "final_ds_1 = final_ds.map(generate_response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "4c52d70c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "be184d5c5c4542d3a9c74f0b60d60922",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "092852cefffc499bbc3faff66d51f144",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "48d956fe01e04c5bb66b1394282eba64",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "final_ds_1.push_to_hub(\"explodinggradients/ragas-wikiqa\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
